In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import histogram
import plotly.io as pio
import plotly.express as px
import plotly.figure_factory as ff
from langdetect import detect
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor  
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier,VotingClassifier,RandomForestClassifier,GradientBoostingClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from category_encoders.binary import BinaryEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score,classification_report
from sklearn.metrics import r2_score
from xgboost import XGBClassifier
import warnings 
warnings.filterwarnings("ignore")

%matplotlib inline
sns.set(rc={'figure.figsize': [10, 10]}, font_scale=1.3)
In [12]:
'''
This dataset comprises more than 3000  hotel reviews collected from various countries.
this dataset collected on the 22 September 2023 and its contains Reviews from 2012 to 2023.

I am sharing a glimpse of a work I completed scraping hundreds of hotels' reviews. 

The data was obtained through web scraping techniques using Selenium and BeautifulSoup.
 
'''
df = pd.read_csv("F:/Local Disk (D)/AI-python-EPSLION/final project 2 nov 2023/my datasett/my dataset/DataSet/global_hotel_reviews.csv")
df.head(5)
Out[12]:
Rating Date Description Hotel_name City Country
0 NaN 18-09-2023 Très bon hôtel comme attendu. On s’occupe de v... barriere-le-majestic Cannes France
1 10.0 5/9/2023 NaN barriere-le-majestic Cannes France
2 8.0 31-10-2022 personnel à l'écoute et agréable, mais les cha... barriere-le-majestic Cannes France
3 10.0 3/9/2022 Accueil chaleureux, bon service et bonne nourr... barriere-le-majestic Cannes France
4 10.0 7/12/2021 Excellent Hotel bien situé, excellent service ... barriere-le-majestic Cannes France
In [13]:
"""
# using df.info() to check how many values is null so here Description  has 3810 - 3799 
= 11 missing values


Country = 3810 - 3796 = 14 missing value

Rating = 3810 - 3793 = 17 missing value
"""
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3810 entries, 0 to 3809
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Rating       3793 non-null   float64
 1   Date         3810 non-null   object 
 2   Description  3799 non-null   object 
 3   Hotel_name   3810 non-null   object 
 4   City         3810 non-null   object 
 5   Country      3796 non-null   object 
dtypes: float64(1), object(5)
memory usage: 178.7+ KB
In [14]:
"""
use this commnd to get a descriptive statistics summary of a given dataframe.
describe command for categorical values only.

the most common hotel to be visited is Grand Fiesta Americana in Mexico country particularly in Cancun.


"""
df.describe(include = 'all')
Out[14]:
Rating Date Description Hotel_name City Country
count 3793.000000 3810 3799 3810 3810 3796
unique NaN 1730 3694 6 5 3
top NaN 22-08-2022 Excellent Grand Fiesta Americana Cancun Mexico
freq NaN 11 9 979 1592 1586
mean 8.851041 NaN NaN NaN NaN NaN
std 1.893585 NaN NaN NaN NaN NaN
min 2.000000 NaN NaN NaN NaN NaN
25% 8.000000 NaN NaN NaN NaN NaN
50% 10.000000 NaN NaN NaN NaN NaN
75% 10.000000 NaN NaN NaN NaN NaN
max 10.000000 NaN NaN NaN NaN NaN
In [15]:
#check dublication 
df.duplicated().sum()
Out[15]:
54
In [16]:
#removing all the duplication before treating with missing data.

df.drop_duplicates(inplace=True)
In [17]:
df.duplicated().sum()
Out[17]:
0
In [18]:
'''
Make some of feature engineering to extract  (day and month and year and time) seperately from dates.

here i make new variable called (NewDate) in order to amend format date.

then i will drop column "Date".

'''
df['NewDate'] = pd.to_datetime(df.Date,format='mixed')

df.NewDate.value_counts()
Out[18]:
NewDate
2022-05-07    10
2022-09-07    10
2023-03-09    10
2023-09-05    10
2023-09-18     9
              ..
2020-11-08     1
2023-04-25     1
2021-05-01     1
2021-11-01     1
2023-07-27     1
Name: count, Length: 1730, dtype: int64
In [19]:
df['NewDate']
Out[19]:
0      2023-09-18
1      2023-05-09
2      2022-10-31
3      2022-03-09
4      2021-07-12
          ...    
3805   2021-04-08
3806   2021-02-08
3807   2021-02-08
3808   2021-02-08
3809   2021-02-08
Name: NewDate, Length: 3756, dtype: datetime64[ns]
In [20]:
df.head(5)
Out[20]:
Rating Date Description Hotel_name City Country NewDate
0 NaN 18-09-2023 Très bon hôtel comme attendu. On s’occupe de v... barriere-le-majestic Cannes France 2023-09-18
1 10.0 5/9/2023 NaN barriere-le-majestic Cannes France 2023-05-09
2 8.0 31-10-2022 personnel à l'écoute et agréable, mais les cha... barriere-le-majestic Cannes France 2022-10-31
3 10.0 3/9/2022 Accueil chaleureux, bon service et bonne nourr... barriere-le-majestic Cannes France 2022-03-09
4 10.0 7/12/2021 Excellent Hotel bien situé, excellent service ... barriere-le-majestic Cannes France 2021-07-12
In [21]:
#then i will drop column "Date".
df.drop('Date', axis=1, inplace=True)
df.head(10)
Out[21]:
Rating Description Hotel_name City Country NewDate
0 NaN Très bon hôtel comme attendu. On s’occupe de v... barriere-le-majestic Cannes France 2023-09-18
1 10.0 NaN barriere-le-majestic Cannes France 2023-05-09
2 8.0 personnel à l'écoute et agréable, mais les cha... barriere-le-majestic Cannes France 2022-10-31
3 10.0 Accueil chaleureux, bon service et bonne nourr... barriere-le-majestic Cannes France 2022-03-09
4 10.0 Excellent Hotel bien situé, excellent service ... barriere-le-majestic Cannes France 2021-07-12
5 10.0 Un sejour plus que parfait dans un hôtel d'exc... barriere-le-majestic Cannes France 2021-11-21
6 10.0 Parfait, une fois sur place, nous avons été su... barriere-le-majestic Cannes NaN 2021-10-17
7 10.0 Un automne incroyable on appelle ça un palace ... barriere-le-majestic Cannes France 2021-03-09
8 10.0 Beautiful hotel, kind and lovely staff , amazi... barriere-le-majestic Cannes NaN 2021-06-21
9 10.0 Personnel très accueillant barriere-le-majestic Cannes France 2021-04-06
In [22]:
df.NewDate.value_counts()
Out[22]:
NewDate
2022-05-07    10
2022-09-07    10
2023-03-09    10
2023-09-05    10
2023-09-18     9
              ..
2020-11-08     1
2023-04-25     1
2021-05-01     1
2021-11-01     1
2023-07-27     1
Name: count, Length: 1730, dtype: int64
In [23]:
# null data here in description and country and rating.
df.isna().sum()
Out[23]:
Rating         17
Description    11
Hotel_name      0
City            0
Country        14
NewDate         0
dtype: int64
In [24]:
''' if you want to know the null and missing values for particular feature wrtie the below command.
here there is 11 missing values in Description.

in country has 14 missing values.
''' 
df['Description'].isnull().sum()
Out[24]:
11
In [25]:
df['Country'].isnull().sum()
Out[25]:
14
In [26]:
'''
here to deal with missing values :
fill missng data.
For Categorical Data ==> Fill with Mode (Most frequent).
for numerical data ==> fill with mean or median depend on is there any outliers or not.

by using this command you can know which most country visited more by people is {Mexico}.

'''
df['Country'].mode()
Out[26]:
0    Mexico
Name: Country, dtype: object
In [27]:
'''
here i make fillna to Country with mode which is most frequent value so data that were missed 
is now be replaced with Country "Mexico" that being visited more.
'''
df['Country'].fillna(df['Country'].mode()[0], inplace=True)
In [28]:
'''
to make sure kindly write below command in order to check if still are missing values or not.

so its observed that the country not having any missing values.
'''
df['Country'].isnull().sum()
Out[28]:
0
In [29]:
df['Description'].mode()
Out[29]:
0    Excellent
Name: Description, dtype: object
In [30]:
'''
here i make fillna to Description with mode which is most frequent value so data that were missed 
is now be replaced with Description "Excellent" .
'''
df['Description'].fillna(df['Description'].mode()[0], inplace=True)
In [31]:
df['Description'].isnull().sum()
Out[31]:
0
In [32]:
'''
 Rating ==>numerical data ==> how to deal missing values in this column?
 
 1) confirm if there is outliers by using boxplot first.
 2) if there are outliers ; the fillna will made using Median (as median not being affected by outliers).
 3) if there are not outliers ; the fillna can be made by both mean and median.
 ===> Here in box plot ; i will accept outliers who lower than lowerfence as its not affected 3810.
as outliers here is  119 record and its normal vaues so its not affected ; anyway i will fillna with median.

but if the ouliers was affect data so in this case you can drop outliers by using IQR.
 '''
Out[32]:
'\n Rating ==>numerical data ==> how to deal missing values in this column?\n \n 1) confirm if there is outliers by using boxplot first.\n 2) if there are outliers ; the fillna will made using Median (as median not being affected by outliers).\n 3) if there are not outliers ; the fillna can be made by both mean and median.\n ===> Here in box plot ; i will accept outliers who lower than lowerfence as its not affected 3810.\nas outliers here is  119 record and its normal vaues so its not affected ; anyway i will fillna with median.\n\nbut if the ouliers was affect data so in this case you can drop outliers by using IQR.\n '
In [33]:
  
fig = px.box(df, y='Rating')


fig.show()
In [34]:
'''
If ouliers was affect data by using IQR using the below commands:

IQR (Inter Quartile Range) Inter Quartile Range approach to finding the outliers is the most commonly used and most trusted approach used in the research field.

IQR = Quartile3 – Quartile1

Q1 = 8
Q3 = 10

lower = 5
upper = 13

# IQR
Q1 = df['Rating'].quantile(0.25)  
Q3 = df['Rating'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5*IQR
upper = Q3 + 1.5*IQR

upper_array = np.where(df['Rating']>=upper)[0]
lower_array = np.where(df['Rating']<=lower)[0]
Removing the outliers

df = df.drop(index=upper_array, inplace=True,axis=1)
df.drop(index=lower_array,inplace=True,axis=1)

but here i will accept outliers as they are 119 from 3810 so its accepted so the fillna with mean will be occured.

'''
Out[34]:
"\nIf ouliers was affect data by using IQR using the below commands:\n\nIQR (Inter Quartile Range) Inter Quartile Range approach to finding the outliers is the most commonly used and most trusted approach used in the research field.\n\nIQR = Quartile3 – Quartile1\n\nQ1 = 8\nQ3 = 10\n\nlower = 5\nupper = 13\n\n# IQR\nQ1 = df['Rating'].quantile(0.25)  \nQ3 = df['Rating'].quantile(0.75)\nIQR = Q3 - Q1\nlower = Q1 - 1.5*IQR\nupper = Q3 + 1.5*IQR\n\nupper_array = np.where(df['Rating']>=upper)[0]\nlower_array = np.where(df['Rating']<=lower)[0]\nRemoving the outliers\n\ndf = df.drop(index=upper_array, inplace=True,axis=1)\ndf.drop(index=lower_array,inplace=True,axis=1)\n\nbut here i will accept outliers as they are 119 from 3810 so its accepted so the fillna with mean will be occured.\n\n"
In [35]:
'''
here median of rating is 10 but the result of df['Rating'].mean() is equal to "8.854774003744318" so if i fillna with mean
this is means that the missing value in Rating will be "8.854774003744318" and there is no Rating review float it be intiger
so i will fillna with median.

'''
df['Rating'].median()
Out[35]:
10.0
In [36]:
'''
i can make fillna to Rating with mean or meadian as outliers is not affect.

here i make fillna to Rating with median .
'''
df['Rating'].fillna(df['Rating'].median(), inplace=True)
In [37]:
df.isna().sum()
Out[37]:
Rating         0
Description    0
Hotel_name     0
City           0
Country        0
NewDate        0
dtype: int64
In [38]:
'''
first record previoulsy was missing value so now its replaced with median which is 10 in this case
'''
df['Rating']
Out[38]:
0       10.0
1       10.0
2        8.0
3       10.0
4       10.0
        ... 
3805     6.0
3806     8.0
3807     8.0
3808    10.0
3809    10.0
Name: Rating, Length: 3756, dtype: float64
In [39]:
df
Out[39]:
Rating Description Hotel_name City Country NewDate
0 10.0 Très bon hôtel comme attendu. On s’occupe de v... barriere-le-majestic Cannes France 2023-09-18
1 10.0 Excellent barriere-le-majestic Cannes France 2023-05-09
2 8.0 personnel à l'écoute et agréable, mais les cha... barriere-le-majestic Cannes France 2022-10-31
3 10.0 Accueil chaleureux, bon service et bonne nourr... barriere-le-majestic Cannes France 2022-03-09
4 10.0 Excellent Hotel bien situé, excellent service ... barriere-le-majestic Cannes France 2021-07-12
... ... ... ... ... ... ...
3805 6.0 Localização e praia são os diferenciais. Atend... InterContinental Presidente Cancun Mexico 2021-04-08
3806 8.0 Excellent InterContinental Presidente Cancun Mexico 2021-02-08
3807 8.0 Sistema de café da manhã e happy hour mal expl... InterContinental Presidente Cancun Mexico 2021-02-08
3808 10.0 Playa espectacular InterContinental Presidente Cancun Mexico 2021-02-08
3809 10.0 Presidente Intercontinental was a great experi... InterContinental Presidente Cancun Mexico 2021-02-08

3756 rows × 6 columns

In [40]:
'''
from here i observe the less rating is (2) and rating (4).

and the more Rating is (10) and (8)

'''
df.Rating.value_counts()
Out[40]:
Rating
10.0    2421
8.0      838
6.0      278
4.0      129
2.0       90
Name: count, dtype: int64
In [41]:
'''
1) first univariate:
here i use univariate plots using histogram.
from here i observe the less rating is (2) and rating (4).

and the more Rating is (10) and (8)
'''

fig = px.histogram(df,x='Rating')
fig.show()
In [42]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 3756 entries, 0 to 3809
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Rating       3756 non-null   float64       
 1   Description  3756 non-null   object        
 2   Hotel_name   3756 non-null   object        
 3   City         3756 non-null   object        
 4   Country      3756 non-null   object        
 5   NewDate      3756 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 205.4+ KB
In [43]:
'''
The most attarctive hotel is "Grand Fiesta Americana" that located in [Mexico] particularly in 'Cancun'.

the second hotel attart visitors is "Warwick Geneva" that located in [Switzerland] and  particularly in 'Geneva'.

the less attarctive hotels is "Hyatt Regency Palais"  that located in [France] particularly in 'NICE' and  

another bad hotel in [france] also which is "Fairmont Monte Carlo" in "monaco" city.

'''
df.Hotel_name.value_counts()
Out[43]:
Hotel_name
Grand Fiesta Americana         979
Warwick Geneva                 646
InterContinental Presidente    581
barriere-le-majestic           574
Fairmont Monte Carlo           495
Hyatt Regency Palais           481
Name: count, dtype: int64
In [44]:
'''
2) second univariate:


The most attarctive hotel is "Grand Fiesta Americana" that located in [Mexico] particularly in 'Cancun'.

the second hotel attart visitors is "Warwick Geneva" that located in [Switzerland] and  particularly in 'Geneva'.

the less attarctive hotels is "Hyatt Regency Palais"  that located in [France] particularly in 'NICE' and  

another bad hotel in [france] also which is "Fairmont Monte Carlo" in "monaco" city.
'''
fig = px.histogram(df,x='Hotel_name',color_discrete_sequence = ['green'])
fig.show()
In [45]:
'''
The city that is considered as the most tourist destination is [Cancun] in 'Mexico' followed by 
[Geneva] in 'Switzerland'

on the other hand the less cities that considered as tourist destination is [Nice] and [Monaco] in france.
'''
df.City.value_counts()
Out[45]:
City
Cancun    1560
Geneva     646
Cannes     574
Monaco     495
Nice       481
Name: count, dtype: int64
In [46]:
'''
3) third univariate:

The city that is considered as the most tourist destination is [Cancun] in 'Mexico' followed by 
[Geneva] in 'Switzerland'

on the other hand the less cities that considered as tourist destination is [Nice] and [Monaco] in france.
'''
fig = px.histogram(df,x='City',color_discrete_sequence = ['red'])
fig.show()
In [47]:
'''
From the prevoius analysis we already be know that that most tourist destination country is "Mexico" 
and the less one is "Switzerland"
'''
df.Country.value_counts()
Out[47]:
Country
Mexico         1568
France         1547
Switzerland     641
Name: count, dtype: int64
In [48]:
'''
4) forth univariate:
The most tourist destination country is "Mexico" 
and the less one is "Switzerland"
'''
fig = px.histogram(df,x='Country',color_discrete_sequence = ['grey'])
fig.show()
In [49]:
'''
from here i need to make features engineers to extract day and month and year in order to analysis data meaning that
i want to know which the more year and day and month and season that being better to visit the mentioned countries.
'''
df['NewDate']
Out[49]:
0      2023-09-18
1      2023-05-09
2      2022-10-31
3      2022-03-09
4      2021-07-12
          ...    
3805   2021-04-08
3806   2021-02-08
3807   2021-02-08
3808   2021-02-08
3809   2021-02-08
Name: NewDate, Length: 3756, dtype: datetime64[ns]
In [50]:
df['Day'] = df['NewDate'].apply(lambda x: str(x).split(' ')[0].split('-')[-1])
df.head(5)
Out[50]:
Rating Description Hotel_name City Country NewDate Day
0 10.0 Très bon hôtel comme attendu. On s’occupe de v... barriere-le-majestic Cannes France 2023-09-18 18
1 10.0 Excellent barriere-le-majestic Cannes France 2023-05-09 09
2 8.0 personnel à l'écoute et agréable, mais les cha... barriere-le-majestic Cannes France 2022-10-31 31
3 10.0 Accueil chaleureux, bon service et bonne nourr... barriere-le-majestic Cannes France 2022-03-09 09
4 10.0 Excellent Hotel bien situé, excellent service ... barriere-le-majestic Cannes France 2021-07-12 12
In [51]:
'''
Extract month ==> feature engineer
'''
df['Month'] = df['NewDate'].apply(lambda x: str(x).split(' ')[0].split('-')[1])
df.head(5)
Out[51]:
Rating Description Hotel_name City Country NewDate Day Month
0 10.0 Très bon hôtel comme attendu. On s’occupe de v... barriere-le-majestic Cannes France 2023-09-18 18 09
1 10.0 Excellent barriere-le-majestic Cannes France 2023-05-09 09 05
2 8.0 personnel à l'écoute et agréable, mais les cha... barriere-le-majestic Cannes France 2022-10-31 31 10
3 10.0 Accueil chaleureux, bon service et bonne nourr... barriere-le-majestic Cannes France 2022-03-09 09 03
4 10.0 Excellent Hotel bien situé, excellent service ... barriere-le-majestic Cannes France 2021-07-12 12 07
In [52]:
df['Month']
Out[52]:
0       09
1       05
2       10
3       03
4       07
        ..
3805    04
3806    02
3807    02
3808    02
3809    02
Name: Month, Length: 3756, dtype: object
In [53]:
#here i convert Month to be numeric in order to extrat feature engineer which is season so in function i have to use numeric values.
df["Month"]= pd.to_numeric(df["Month"],errors='coerce')

df.head(5)
Out[53]:
Rating Description Hotel_name City Country NewDate Day Month
0 10.0 Très bon hôtel comme attendu. On s’occupe de v... barriere-le-majestic Cannes France 2023-09-18 18 9
1 10.0 Excellent barriere-le-majestic Cannes France 2023-05-09 09 5
2 8.0 personnel à l'écoute et agréable, mais les cha... barriere-le-majestic Cannes France 2022-10-31 31 10
3 10.0 Accueil chaleureux, bon service et bonne nourr... barriere-le-majestic Cannes France 2022-03-09 09 3
4 10.0 Excellent Hotel bien situé, excellent service ... barriere-le-majestic Cannes France 2021-07-12 12 7
In [54]:
def months(x):
    if x in [12, 1, 2]:
        return 'Winter'
    elif x in [3, 4, 5]:
        return 'Spring'
    elif x in [6, 7, 8]:
        return 'Summer'
    elif x in [9, 10, 11]:
        return 'Autumn'

df['Season'] = df['Month'].apply(months)
df.head(5)
Out[54]:
Rating Description Hotel_name City Country NewDate Day Month Season
0 10.0 Très bon hôtel comme attendu. On s’occupe de v... barriere-le-majestic Cannes France 2023-09-18 18 9 Autumn
1 10.0 Excellent barriere-le-majestic Cannes France 2023-05-09 09 5 Spring
2 8.0 personnel à l'écoute et agréable, mais les cha... barriere-le-majestic Cannes France 2022-10-31 31 10 Autumn
3 10.0 Accueil chaleureux, bon service et bonne nourr... barriere-le-majestic Cannes France 2022-03-09 09 3 Spring
4 10.0 Excellent Hotel bien situé, excellent service ... barriere-le-majestic Cannes France 2021-07-12 12 7 Summer
In [55]:
'''
Extract year ==> feature engineer
'''
df['year'] = df['NewDate'].apply(lambda x: str(x).split(' ')[0].split('-')[0])
df.head(5)
Out[55]:
Rating Description Hotel_name City Country NewDate Day Month Season year
0 10.0 Très bon hôtel comme attendu. On s’occupe de v... barriere-le-majestic Cannes France 2023-09-18 18 9 Autumn 2023
1 10.0 Excellent barriere-le-majestic Cannes France 2023-05-09 09 5 Spring 2023
2 8.0 personnel à l'écoute et agréable, mais les cha... barriere-le-majestic Cannes France 2022-10-31 31 10 Autumn 2022
3 10.0 Accueil chaleureux, bon service et bonne nourr... barriere-le-majestic Cannes France 2022-03-09 09 3 Spring 2022
4 10.0 Excellent Hotel bien situé, excellent service ... barriere-le-majestic Cannes France 2021-07-12 12 7 Summer 2021
In [56]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 3756 entries, 0 to 3809
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Rating       3756 non-null   float64       
 1   Description  3756 non-null   object        
 2   Hotel_name   3756 non-null   object        
 3   City         3756 non-null   object        
 4   Country      3756 non-null   object        
 5   NewDate      3756 non-null   datetime64[ns]
 6   Day          3756 non-null   object        
 7   Month        3756 non-null   int64         
 8   Season       3756 non-null   object        
 9   year         3756 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(7)
memory usage: 322.8+ KB
In [57]:
#here i convert day and year to be numeric values.
df["Day"]= pd.to_numeric(df["Day"],errors='coerce')
df["year"]= pd.to_numeric(df["year"],errors='coerce')
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 3756 entries, 0 to 3809
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Rating       3756 non-null   float64       
 1   Description  3756 non-null   object        
 2   Hotel_name   3756 non-null   object        
 3   City         3756 non-null   object        
 4   Country      3756 non-null   object        
 5   NewDate      3756 non-null   datetime64[ns]
 6   Day          3756 non-null   int64         
 7   Month        3756 non-null   int64         
 8   Season       3756 non-null   object        
 9   year         3756 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(5)
memory usage: 322.8+ KB
In [58]:
df.Season.value_counts()
Out[58]:
Season
Summer    1206
Autumn     935
Spring     889
Winter     726
Name: count, dtype: int64
In [59]:
''''
5) fifth univariate:
here the preffered season is summer followed by autumn.
and the worst season for tourism is winter.

'''
fig = px.histogram(df,x='Season',color_discrete_sequence = ['purple'])
fig.show()
In [60]:
df.head()
Out[60]:
Rating Description Hotel_name City Country NewDate Day Month Season year
0 10.0 Très bon hôtel comme attendu. On s’occupe de v... barriere-le-majestic Cannes France 2023-09-18 18 9 Autumn 2023
1 10.0 Excellent barriere-le-majestic Cannes France 2023-05-09 9 5 Spring 2023
2 8.0 personnel à l'écoute et agréable, mais les cha... barriere-le-majestic Cannes France 2022-10-31 31 10 Autumn 2022
3 10.0 Accueil chaleureux, bon service et bonne nourr... barriere-le-majestic Cannes France 2022-03-09 9 3 Spring 2022
4 10.0 Excellent Hotel bien situé, excellent service ... barriere-le-majestic Cannes France 2021-07-12 12 7 Summer 2021
In [61]:
'''
more visitors in the end of first week and [second week]  month while less tourist in the start of the first week of month.
'''
df.Day.value_counts()
Out[61]:
Day
7     194
9     184
8     175
19    146
6     145
23    137
5     137
18    136
21    136
28    130
24    127
14    126
22    123
26    120
13    119
20    118
25    117
30    117
15    115
4     115
1     112
16    111
27    111
10    109
29    102
12     92
11     88
17     85
31     83
3      74
2      72
Name: count, dtype: int64
In [62]:
df_sort_day = df.sort_values(by='Day')
In [63]:
''''
6) six univariate:

I need to support static exploration about Day which people travel more by using seaborn countplot.

more visitors in the seventh and ninth day of the month.
while the less visitors be in second and third day of the month.

'''
sns.countplot(x='Day',data=df_sort_day)
Out[63]:
<Axes: xlabel='Day', ylabel='count'>
In [64]:
df_sort_month = df.sort_values(by='Month')
In [65]:
df.Month.value_counts()
Out[65]:
Month
8     452
7     417
9     362
6     337
10    320
4     317
3     296
5     276
11    253
2     253
12    245
1     228
Name: count, dtype: int64
In [66]:
'''
7) seven univariate:
here the most months that attract visitors is [August and July] ; while the less month in terms of visit tourists is [Junuary 
and december].
'''
sns.countplot(x='Month',data=df_sort_month)
Out[66]:
<Axes: xlabel='Month', ylabel='count'>
In [67]:
df.year.value_counts()
Out[67]:
year
2022    995
2023    814
2018    476
2019    475
2021    319
2017    194
2015    171
2020    147
2016    121
2014     23
2013     20
2012      1
Name: count, dtype: int64
In [68]:
df_sort_year = df.sort_values(by='year')
In [69]:
'''
8) eight univariate:
     visitors travelled more in 2022 and 2023 while less visitors travel in 2012 and 2013.
'''
fig = px.histogram(df_sort_year,x='year',color_discrete_sequence = ['purple'])
fig.show()

Bivariate¶

In [70]:
'''
1) first bivariate:

here its observed that Mexico take the maximim rate while switzerland take less rating.
'''

sns.violinplot(x = 'Rating',y = 'Country',data = df)
Out[70]:
<Axes: xlabel='Rating', ylabel='Country'>
In [71]:
'''
2)  Second bivariate:

another way to visualize between rating and country 
here its observed that Mexico take the maximim rate while switzerland take less rating.
'''


px.bar(df,x='Rating',y='Country')
In [72]:
'''
3)  third bivariate:

here observed that hotel that take most of the rating is Grand Fiesta Americana" that 
located in [Mexico] particularly in 'Cancun' city.

while hotels that took less rating are :

"Hyatt Regency Palais"  that located in [France] particularly in 'NICE'.

and  "Fairmont Monte Carlo" in "monaco" city in [France].


'''
px.bar(df,x='Rating',y='Hotel_name')
In [73]:
'''
4)  four bivariate:
here the more rating in Cancun (mexico) while the less rating in 'Nice' and 'Monaco' in france.
'''
fig = px.bar(df,x='Rating',y='City')
fig.show()
In [74]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 3756 entries, 0 to 3809
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Rating       3756 non-null   float64       
 1   Description  3756 non-null   object        
 2   Hotel_name   3756 non-null   object        
 3   City         3756 non-null   object        
 4   Country      3756 non-null   object        
 5   NewDate      3756 non-null   datetime64[ns]
 6   Day          3756 non-null   int64         
 7   Month        3756 non-null   int64         
 8   Season       3756 non-null   object        
 9   year         3756 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(5)
memory usage: 322.8+ KB
In [75]:
'''
5)  five bivariate:
from here its observed that more rating in summer season in [Mexico.]
while less rating in season (Autumn) in [Switzerland].

'''
sns.catplot(data=df, x="Rating", y="Country", col="Season", aspect=.5)
Out[75]:
<seaborn.axisgrid.FacetGrid at 0x180939cae10>
In [76]:
'''
6)  six bivariate:

here in August the more visitors was in Mexico while less visiors in Switzerland in winter.
'''
sns.countplot(x='Month',hue='Country',data=df)
Out[76]:
<Axes: xlabel='Month', ylabel='count'>
In [77]:
'''
7)  seven bivariate:

here the more rating in seasons (Summer followed by Autumn)  and the less rating in season (winter).
'''
sns.countplot(x='Rating',hue='Season',data=df)
Out[77]:
<Axes: xlabel='Rating', ylabel='count'>
In [78]:
'''
8)  eight bivariate:
'''
px.density_heatmap(df,x='Rating',y='year')
In [79]:
'''
==> In Univariate i use 8 visualization :
6 histogram and two countplot(seaborn).

==> In Bivariate i use 8 visualization which are:
1- sns.violinplot ==> one
2- sns.catplot ==> one
3- px.bar ==> three 
4- countplot ==> two
5- px.density_heatmap (plotly) ==> one

==> use px.imshow to show coffecient correlation between Rating(target)and other features.
==> and to detect outliers i use two boxplot ==> that was accepted as its normal case.

and  i use 8 different types of visualization and 16 visualization mixed between Univariate and Bivariate.

Brief of this analysis:

- people preferred the hotel "Grand Fiesta Americana in Mexico country particularly in Cancun in summer season.

- they do not prefer "Hyatt Regency Palais"  that located in [France] particularly in 'NICE' and  also
"Fairmont Monte Carlo" hotel in "monaco" city in also france.


- the country that prefferd is Mexico then Frace then Switherland.

- People travel more in 2022 & 2023 while they didnt prefer to travel or went to hotels in 2012 &2013 and in my opinion that

with the passage of time , people realize the importance of travel and its impact to change your mood ; makes you more able to
be productve in your worl because after vacaion you feel satisfied and have energy that helps you get the work done.

- here the most months that attract visitors is [August and July] ; while the less month in terms of visit tourists is [Junuary 
and december].

'''
Out[79]:
'\n==> In Univariate i use 8 visualization :\n6 histogram and two countplot(seaborn).\n\n==> In Bivariate i use 8 visualization which are:\n1- sns.violinplot ==> one\n2- sns.catplot ==> one\n3- px.bar ==> three \n4- countplot ==> two\n5- px.density_heatmap (plotly) ==> one\n\n==> use px.imshow to show coffecient correlation between Rating(target)and other features.\n==> and to detect outliers i use two boxplot ==> that was accepted as its normal case.\n\nand  i use 8 different types of visualization and 16 visualization mixed between Univariate and Bivariate.\n\nBrief of this analysis:\n\n- people preferred the hotel "Grand Fiesta Americana in Mexico country particularly in Cancun in summer season.\n\n- they do not prefer "Hyatt Regency Palais"  that located in [France] particularly in \'NICE\' and  also\n"Fairmont Monte Carlo" hotel in "monaco" city in also france.\n\n\n- the country that prefferd is Mexico then Frace then Switherland.\n\n- People travel more in 2022 & 2023 while they didnt prefer to travel or went to hotels in 2012 &2013 and in my opinion that\n\nwith the passage of time , people realize the importance of travel and its impact to change your mood ; makes you more able to\nbe productve in your worl because after vacaion you feel satisfied and have energy that helps you get the work done.\n\n- here the most months that attract visitors is [August and July] ; while the less month in terms of visit tourists is [Junuary \nand december].\n\n'

Machine Learning¶

In [80]:
df
Out[80]:
Rating Description Hotel_name City Country NewDate Day Month Season year
0 10.0 Très bon hôtel comme attendu. On s’occupe de v... barriere-le-majestic Cannes France 2023-09-18 18 9 Autumn 2023
1 10.0 Excellent barriere-le-majestic Cannes France 2023-05-09 9 5 Spring 2023
2 8.0 personnel à l'écoute et agréable, mais les cha... barriere-le-majestic Cannes France 2022-10-31 31 10 Autumn 2022
3 10.0 Accueil chaleureux, bon service et bonne nourr... barriere-le-majestic Cannes France 2022-03-09 9 3 Spring 2022
4 10.0 Excellent Hotel bien situé, excellent service ... barriere-le-majestic Cannes France 2021-07-12 12 7 Summer 2021
... ... ... ... ... ... ... ... ... ... ...
3805 6.0 Localização e praia são os diferenciais. Atend... InterContinental Presidente Cancun Mexico 2021-04-08 8 4 Spring 2021
3806 8.0 Excellent InterContinental Presidente Cancun Mexico 2021-02-08 8 2 Winter 2021
3807 8.0 Sistema de café da manhã e happy hour mal expl... InterContinental Presidente Cancun Mexico 2021-02-08 8 2 Winter 2021
3808 10.0 Playa espectacular InterContinental Presidente Cancun Mexico 2021-02-08 8 2 Winter 2021
3809 10.0 Presidente Intercontinental was a great experi... InterContinental Presidente Cancun Mexico 2021-02-08 8 2 Winter 2021

3756 rows × 10 columns

In [81]:
'''
here i will drop Description , NewDate , Season
'''
df.drop('Description',axis=1,inplace = True)
df.drop('NewDate',axis=1,inplace = True)
df.drop('Season',axis=1,inplace = True)
In [82]:
'''
use binary encoder to convert categorical to numerical data.
'''
encoder = BinaryEncoder()
New_df = encoder.fit_transform(df[['Hotel_name']])
New_df
Out[82]:
Hotel_name_0 Hotel_name_1 Hotel_name_2
0 0 0 1
1 0 0 1
2 0 0 1
3 0 0 1
4 0 0 1
... ... ... ...
3805 1 1 0
3806 1 1 0
3807 1 1 0
3808 1 1 0
3809 1 1 0

3756 rows × 3 columns

In [83]:
df = pd.concat([df,New_df] , axis = 1 )
In [84]:
df.head()
Out[84]:
Rating Hotel_name City Country Day Month year Hotel_name_0 Hotel_name_1 Hotel_name_2
0 10.0 barriere-le-majestic Cannes France 18 9 2023 0 0 1
1 10.0 barriere-le-majestic Cannes France 9 5 2023 0 0 1
2 8.0 barriere-le-majestic Cannes France 31 10 2022 0 0 1
3 10.0 barriere-le-majestic Cannes France 9 3 2022 0 0 1
4 10.0 barriere-le-majestic Cannes France 12 7 2021 0 0 1
In [85]:
df.drop('Hotel_name',axis=1,inplace = True)
In [86]:
df
Out[86]:
Rating City Country Day Month year Hotel_name_0 Hotel_name_1 Hotel_name_2
0 10.0 Cannes France 18 9 2023 0 0 1
1 10.0 Cannes France 9 5 2023 0 0 1
2 8.0 Cannes France 31 10 2022 0 0 1
3 10.0 Cannes France 9 3 2022 0 0 1
4 10.0 Cannes France 12 7 2021 0 0 1
... ... ... ... ... ... ... ... ... ...
3805 6.0 Cancun Mexico 8 4 2021 1 1 0
3806 8.0 Cancun Mexico 8 2 2021 1 1 0
3807 8.0 Cancun Mexico 8 2 2021 1 1 0
3808 10.0 Cancun Mexico 8 2 2021 1 1 0
3809 10.0 Cancun Mexico 8 2 2021 1 1 0

3756 rows × 9 columns

In [87]:
df=pd.get_dummies(df,columns=['City','Country'],drop_first=True)
In [88]:
df.head(1000)
Out[88]:
Rating Day Month year Hotel_name_0 Hotel_name_1 Hotel_name_2 City_Cannes City_Geneva City_Monaco City_Nice Country_Mexico Country_Switzerland
0 10.0 18 9 2023 0 0 1 True False False False False False
1 10.0 9 5 2023 0 0 1 True False False False False False
2 8.0 31 10 2022 0 0 1 True False False False False False
3 10.0 9 3 2022 0 0 1 True False False False False False
4 10.0 12 7 2021 0 0 1 True False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1002 10.0 4 10 2018 0 1 0 False False True False False False
1003 8.0 4 9 2018 0 1 0 False False True False False False
1004 8.0 4 2 2018 0 1 0 False False True False False False
1005 10.0 4 2 2018 0 1 0 False False True False False False
1006 8.0 4 1 2018 0 1 0 False False True False False False

1000 rows × 13 columns

In [89]:
'''
here make rateing as intiger in order to be intiger number not continuous (not to be 3.5 as a rating)
'''
df["Rating"]= df["Rating"].astype('int32')
In [90]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 3756 entries, 0 to 3809
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Rating               3756 non-null   int32
 1   Day                  3756 non-null   int64
 2   Month                3756 non-null   int64
 3   year                 3756 non-null   int64
 4   Hotel_name_0         3756 non-null   int64
 5   Hotel_name_1         3756 non-null   int64
 6   Hotel_name_2         3756 non-null   int64
 7   City_Cannes          3756 non-null   bool 
 8   City_Geneva          3756 non-null   bool 
 9   City_Monaco          3756 non-null   bool 
 10  City_Nice            3756 non-null   bool 
 11  Country_Mexico       3756 non-null   bool 
 12  Country_Switzerland  3756 non-null   bool 
dtypes: bool(6), int32(1), int64(6)
memory usage: 242.1 KB
In [91]:
df["Rating"].value_counts()
Out[91]:
Rating
10    2421
8      838
6      278
4      129
2       90
Name: count, dtype: int64

High Correlation Filter¶

In [92]:
'''
This technique to display the correlation coefficients for different variables.

this tool to  identify and visualize patterns in the given data

'''

corr = df.corr()
corr
Out[92]:
Rating Day Month year Hotel_name_0 Hotel_name_1 Hotel_name_2 City_Cannes City_Geneva City_Monaco City_Nice Country_Mexico Country_Switzerland
Rating 1.000000 -0.037824 -0.006829 0.069821 0.001417 -0.048482 0.153446 0.025296 -0.097972 -0.075092 0.046688 0.077281 -0.098913
Day -0.037824 1.000000 0.055903 -0.035756 0.011670 -0.001068 -0.017698 -0.027459 0.033057 0.002970 0.009368 -0.010508 0.029743
Month -0.006829 0.055903 1.000000 -0.079366 -0.052631 0.083725 -0.031830 -0.014821 -0.027512 0.048011 0.044900 -0.033569 -0.024199
year 0.069821 -0.035756 -0.079366 1.000000 0.489032 0.047524 0.075642 -0.433572 -0.247107 -0.155508 -0.096209 0.676796 -0.246884
Hotel_name_0 0.001417 0.011670 -0.052631 0.489032 1.000000 -0.366062 -0.234024 -0.506691 0.382031 -0.464798 -0.457197 0.706308 0.380245
Hotel_name_1 -0.048482 -0.001068 0.083725 0.047524 -0.366062 1.000000 -0.392813 -0.357386 -0.383502 0.463015 0.455444 -0.075610 -0.381708
Hotel_name_2 0.153446 -0.017698 -0.031830 0.075642 -0.234024 -0.392813 1.000000 0.390793 -0.495330 -0.423434 0.352621 0.143973 -0.493014
City_Cannes 0.025296 -0.027459 -0.014821 -0.433572 -0.506691 -0.357386 0.390793 1.000000 -0.193572 -0.165475 -0.162769 -0.355045 -0.192666
City_Geneva -0.097972 0.033057 -0.027512 -0.247107 0.382031 -0.383502 -0.495330 -0.193572 1.000000 -0.177567 -0.174664 -0.378667 0.995323
City_Monaco -0.075092 0.002970 0.048011 -0.155508 -0.464798 0.463015 -0.423434 -0.165475 -0.177567 1.000000 -0.149312 -0.329820 -0.176737
City_Nice 0.046688 0.009368 0.044900 -0.096209 -0.457197 0.455444 0.352621 -0.162769 -0.174664 -0.149312 1.000000 -0.324426 -0.173847
Country_Mexico 0.077281 -0.010508 -0.033569 0.676796 0.706308 -0.075610 0.143973 -0.355045 -0.378667 -0.329820 -0.324426 1.000000 -0.384016
Country_Switzerland -0.098913 0.029743 -0.024199 -0.246884 0.380245 -0.381708 -0.493014 -0.192666 0.995323 -0.176737 -0.173847 -0.384016 1.000000
In [93]:
'''
here i use  high correlation filter and from the below its show that there is better correlation between year and Rating than day
and month so i will drop day and month as thier correlation with rating equal negative .

'''
fig=px.imshow(corr,text_auto=True)
fig.update_layout(width=1000,height=800)
In [94]:
df.drop('Day',axis=1,inplace = True)
df.drop('Month',axis=1,inplace = True)
In [95]:
'''
Target here is Rating which is numeric.
'''
x=df.drop('Rating',axis=1)
y = df['Rating']
In [96]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0,test_size=0.2)
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

HyperParameter Tunning¶

HyperParameter Tunning directly affects model performance.

==> each model has its hyperparameter and each parameter has number of different values so the question now which the best values of this parameter that achieve the best performance and accuracy.

HyperParameter Tunning such as : GridsearchCV and Randomized Search.

1) GridsearchCV: ==> i will use this teqnique (more accuracy here on model SVC as its the better model result. GridSearchCV exhaustively considers all parameter combinations.

GridsearchCV advantage: more accuracy than Randomized Search as in Gridsearch try all the possibilities in value range and runnig models depend on number of possiblilties.

GridsearchCV disadvantage: take alot of time and high cost.

The GridSearchCV instance implements the usual estimator API: when “fitting” it on a dataset all the possible combinations of parameter values are evaluated and the best combination is retained.

2) Randomized Search: here try random values and learn random models.

Randomized Search advantage: cost and time redduction.

Randomized Search disadvantage: less accuracy than GridsearchCV .

In [97]:
'''
HyperParameter Tunning (GridSearch)
1) For SVC
'''
model=SVC()
params = [
        {'C':[1, 10], 'kernel':['linear', 'sigmoid', 'poly'],'random_state':range(0,10)},
        {'C':[1, 10], 'kernel':['rbf'], 'gamma':[0.5, 0.6, 0.7, 0.1, 0.01, 0.01],'random_state':range(0,10)}
         ]
grid_search_svc=GridSearchCV(estimator=model,
                        param_grid=params,
                        scoring='accuracy',
                        n_jobs=-1)
grid_search_svc.fit(x_train,y_train)
Out[97]:
GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid=[{'C': [1, 10], 'kernel': ['linear', 'sigmoid', 'poly'],
                          'random_state': range(0, 10)},
                         {'C': [1, 10],
                          'gamma': [0.5, 0.6, 0.7, 0.1, 0.01, 0.01],
                          'kernel': ['rbf'], 'random_state': range(0, 10)}],
             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid=[{'C': [1, 10], 'kernel': ['linear', 'sigmoid', 'poly'],
                          'random_state': range(0, 10)},
                         {'C': [1, 10],
                          'gamma': [0.5, 0.6, 0.7, 0.1, 0.01, 0.01],
                          'kernel': ['rbf'], 'random_state': range(0, 10)}],
             scoring='accuracy')
SVC()
SVC()
In [98]:
'''
best parms here is that when C:1 and gamma eq 0.1 and kernel eq 'rbf ==>default'
'''
grid_search_svc.best_params_
Out[98]:
{'C': 1, 'gamma': 0.1, 'kernel': 'rbf', 'random_state': 0}
In [99]:
'''
here SVC best score is 0.6411453133666112
'''
grid_search_svc.best_score_
Out[99]:
0.6411453133666112
In [100]:
'''
HyperParameter Tunning (GridSearch)
2) For Logistic Regression
'''
grid={"C":np.logspace(1,3,10), "penalty":["l1","l2"]}# l1 lasso l2 ridge

logreg=LogisticRegression()

logreg_cv=GridSearchCV(logreg,grid,cv=10)

logreg_cv.fit(x_train,y_train)
Out[100]:
GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': array([  10.        ,   16.68100537,   27.82559402,   46.41588834,
         77.42636827,  129.1549665 ,  215.443469  ,  359.38136638,
        599.48425032, 1000.        ]),
                         'penalty': ['l1', 'l2']})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': array([  10.        ,   16.68100537,   27.82559402,   46.41588834,
         77.42636827,  129.1549665 ,  215.443469  ,  359.38136638,
        599.48425032, 1000.        ]),
                         'penalty': ['l1', 'l2']})
LogisticRegression()
LogisticRegression()
In [101]:
logreg_cv.best_params_
Out[101]:
{'C': 10.0, 'penalty': 'l2'}
In [102]:
logreg_cv.best_score_
Out[102]:
0.6401417497231451
In [103]:
'''
HyperParameter Tunning (GridSearch)
3) For KNN
'''
knn_classifer=KNeighborsClassifier()
params = [{'n_neighbors': [3, 5, 7, 9],
         'weights': ['uniform', 'distance'],
         'algorithm':['ball_tree','kd_tree','brute'],
         'metric':['cityblock','cosine','euclidean','l1','l2','haversine','manhattan','nan_euclidean','minkowski'],
         'leaf_size': [15, 40]}]
grid_search_knn = GridSearchCV(knn_classifer,
                      param_grid=params,
                      scoring='accuracy')
grid_search_knn.fit(x_train, y_train)
Out[103]:
GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid=[{'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                          'leaf_size': [15, 40],
                          'metric': ['cityblock', 'cosine', 'euclidean', 'l1',
                                     'l2', 'haversine', 'manhattan',
                                     'nan_euclidean', 'minkowski'],
                          'n_neighbors': [3, 5, 7, 9],
                          'weights': ['uniform', 'distance']}],
             scoring='accuracy')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid=[{'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                          'leaf_size': [15, 40],
                          'metric': ['cityblock', 'cosine', 'euclidean', 'l1',
                                     'l2', 'haversine', 'manhattan',
                                     'nan_euclidean', 'minkowski'],
                          'n_neighbors': [3, 5, 7, 9],
                          'weights': ['uniform', 'distance']}],
             scoring='accuracy')
KNeighborsClassifier()
KNeighborsClassifier()
In [104]:
grid_search_knn.best_params_
Out[104]:
{'algorithm': 'brute',
 'leaf_size': 15,
 'metric': 'cosine',
 'n_neighbors': 7,
 'weights': 'uniform'}
In [105]:
grid_search_knn.best_score_
Out[105]:
0.6221741541874654
In [106]:
'''
after using GridSearchCV over models: SVC , Logistic Regression, KNN.
The result is that SVC has the best score.
'''
Out[106]:
'\nafter using GridSearchCV over models: SVC , Logistic Regression, KNN.\nThe result is that SVC has the best score.\n'
In [107]:
from imblearn.over_sampling import RandomOverSampler
sampler =RandomOverSampler()
x,y = sampler.fit_resample(x_train,y_train)
In [108]:
'''
when use classificaion the result being better than using rgression as i explained below in the below comment.

'''

models={
    'log_reg':LogisticRegression(),
    'KNN':KNeighborsClassifier(n_neighbors=5),
    'SVC':SVC(),
    'NB':GaussianNB(),
    'DT':DecisionTreeClassifier(),
    'RF':RandomForestClassifier(n_estimators=25,n_jobs=-1),
    'Bagging_classifier':BaggingClassifier(DecisionTreeClassifier(),n_estimators=5,n_jobs=-1),
    'voting': VotingClassifier(estimators=[('LR',LogisticRegression()),('NB',GaussianNB()),('DT',DecisionTreeClassifier())])
}
In [110]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,recall_score,precision_score
import joblib
for name,model in models.items():
    print('--------- ',name,'-------------')
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print('accuracy_training: ',accuracy_score(y_train,model.predict(x_train)))
    print('accuracy_testing: ',accuracy_score(y_pred,y_test))
    print('confusion matrix: ',confusion_matrix(y_pred,y_test))
    print('recall score: ',recall_score(y_pred,y_test,average='micro'))
    print('precision score: ',precision_score(y_pred,y_test,average='micro'))
    joblib.dump(model,name+'_model.h5')
    print('-'*30)

'''
the better result from classification is SVC.
'''
---------  log_reg -------------
accuracy_training:  0.6431424766977364
accuracy_testing:  0.6555851063829787
confusion matrix:  [[  0   0   0   0   0]
 [  0   0   0   0   0]
 [  0   0   0   0   0]
 [  0   0   1   7   9]
 [ 19  25  42 163 486]]
recall score:  0.6555851063829787
precision score:  0.6555851063829787
------------------------------
---------  KNN -------------
accuracy_training:  0.5282956058588548
accuracy_testing:  0.4867021276595745
confusion matrix:  [[  0   0   0   0   0]
 [  0   0   0   0   0]
 [  0   0   0   1   1]
 [  9   5  17  55 183]
 [ 10  20  26 114 311]]
recall score:  0.4867021276595745
precision score:  0.4867021276595745
------------------------------
---------  SVC -------------
accuracy_training:  0.6411451398135819
accuracy_testing:  0.6582446808510638
confusion matrix:  [[  0   0   0   0   0]
 [  0   0   0   0   0]
 [  0   0   0   0   0]
 [  0   0   0   0   0]
 [ 19  25  43 170 495]]
recall score:  0.6582446808510638
precision score:  0.6582446808510638
------------------------------
---------  NB -------------
accuracy_training:  0.6208388814913449
accuracy_testing:  0.6356382978723404
confusion matrix:  [[  0   0   0   0   0]
 [  0   0   0   0   0]
 [  0   0   0   0   0]
 [  3   6  12  44  61]
 [ 16  19  31 126 434]]
recall score:  0.6356382978723404
precision score:  0.6356382978723404
------------------------------
---------  DT -------------
accuracy_training:  0.6464713715046605
accuracy_testing:  0.648936170212766
confusion matrix:  [[  0   0   0   0   0]
 [  0   0   0   0   0]
 [  0   0   0   1   2]
 [  2   2   1  19  24]
 [ 17  23  42 150 469]]
recall score:  0.648936170212766
precision score:  0.648936170212766
------------------------------
---------  RF -------------
accuracy_training:  0.6461384820239681
accuracy_testing:  0.6476063829787234
confusion matrix:  [[  0   0   0   0   0]
 [  0   0   0   0   0]
 [  0   0   0   1   2]
 [  1   1   1   9  15]
 [ 18  24  42 160 478]]
recall score:  0.6476063829787234
precision score:  0.6476063829787234
------------------------------
---------  Bagging_classifier -------------
accuracy_training:  0.6448069241011984
accuracy_testing:  0.6529255319148937
confusion matrix:  [[  0   0   0   0   0]
 [  0   0   0   0   0]
 [  0   0   0   1   2]
 [  2   4   1  26  28]
 [ 17  21  42 143 465]]
recall score:  0.6529255319148937
precision score:  0.6529255319148937
------------------------------
---------  voting -------------
accuracy_training:  0.6451398135818908
accuracy_testing:  0.651595744680851
confusion matrix:  [[  0   0   0   0   0]
 [  0   0   0   0   0]
 [  0   0   0   0   0]
 [  2   2   1  19  24]
 [ 17  23  42 151 471]]
recall score:  0.651595744680851
precision score:  0.651595744680851
------------------------------
Out[110]:
'\nthe better result from classification is SVC.\n'
In [189]:
'''
make this command in order to get features that i will use after that in deployment.
'''
df.columns
Out[189]:
Index(['Rating', 'year', 'Hotel_name_0', 'Hotel_name_1', 'Hotel_name_2',
       'City_Cannes', 'City_Geneva', 'City_Monaco', 'City_Nice',
       'Country_Mexico', 'Country_Switzerland'],
      dtype='object')
In [190]:
features = ['Rating', 'year', 'Hotel_name_0', 'Hotel_name_1', 'Hotel_name_2',
       'City_Cannes', 'City_Geneva', 'City_Monaco', 'City_Nice',
       'Country_Mexico', 'Country_Switzerland']
In [191]:
'''
this to having feature and scaler that help me in deployment.
'''
joblib.dump(features,'features.h5')
joblib.dump(scaler,'scaler.h5')
Out[191]:
['scaler.h5']
In [603]:
'''
when using regression by using below commands ; the result was so bad and there is accuracy  was negative .

so i decide to use
classification so in this case results be fine as per above and the better logorithm is SVC.


models={
    'log_reg':LogisticRegression(),
    
    'SVC':SVC(),
    'DT':DecisionTreeRegressor(),
    'RF':RandomForestRegressor(n_estimators=25,n_jobs=-1),
    'Bagging_classifier':BaggingRegressor(DecisionTreeRegressor(),n_estimators=5,n_jobs=-1),
    'xgboost':GradientBoostingRegressor(random_state=0),
  
}

for name,model in models.items():
    print('--------- ',name,'-------------')
    model.fit(x_train,y_train)
    
    print('R2 train score: ',model.score(x_train,y_train))
    print('R2_test score: ',model.score(x_test,y_test))
    
    print('-'*30)
    
    


'''
Out[603]:
"\nwhen using regression by using below commands ; the result was so bad and KNN accuracy was negative and also when using \nthe regression i was need to drop day , year , month and this will make app so poor.\n\nso i decide to use\nclassification so in this case results be fine as per above and the better logorithm is SVC.\n\n\nmodels={\n    'log_reg':LogisticRegression(),\n    \n    'SVC':SVC(),\n    'DT':DecisionTreeRegressor(),\n    'RF':RandomForestRegressor(n_estimators=25,n_jobs=-1),\n    'Bagging_classifier':BaggingRegressor(DecisionTreeRegressor(),n_estimators=5,n_jobs=-1),\n    'xgboost':GradientBoostingRegressor(random_state=0),\n  \n}\n\nfor name,model in models.items():\n    print('--------- ',name,'-------------')\n    model.fit(x_train,y_train)\n    \n    print('R2 train score: ',model.score(x_train,y_train))\n    print('R2_test score: ',model.score(x_test,y_test))\n    \n    print('-'*30)\n    \n    \n\n\n"
In [192]:
df
Out[192]:
Rating year Hotel_name_0 Hotel_name_1 Hotel_name_2 City_Cannes City_Geneva City_Monaco City_Nice Country_Mexico Country_Switzerland
0 10 2023 0 0 1 True False False False False False
1 10 2023 0 0 1 True False False False False False
2 8 2022 0 0 1 True False False False False False
3 10 2022 0 0 1 True False False False False False
4 10 2021 0 0 1 True False False False False False
... ... ... ... ... ... ... ... ... ... ... ...
3805 6 2021 1 1 0 False False False False True False
3806 8 2021 1 1 0 False False False False True False
3807 8 2021 1 1 0 False False False False True False
3808 10 2021 1 1 0 False False False False True False
3809 10 2021 1 1 0 False False False False True False

3756 rows × 11 columns

summary ML¶

In [ ]:
'''
- high correlation filter result ==> there is better correlation between year and Rating while
has bad correlation between day and month so i will drop them.

- HyperParmeter Tunning Result ==> use GridsearchCV as it has more accuracy ; and the result after using it is that 
the best alogorithm is SVC.

- aftr applying several algorithm the better model is SVC as it has more accuracy_training and accuracy_testing.


'''